{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import featuretools as ft\n",
    "from woodwork.logical_types import Categorical, NaturalLanguage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>customer_id</th>\n",
       "      <th>invoice</th>\n",
       "      <th>invoice_date</th>\n",
       "      <th>stock_code</th>\n",
       "      <th>description</th>\n",
       "      <th>quantity</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>13085.0</td>\n",
       "      <td>489434</td>\n",
       "      <td>2009-12-01 07:45:00</td>\n",
       "      <td>85048</td>\n",
       "      <td>15CM CHRISTMAS GLASS BALL 20 LIGHTS</td>\n",
       "      <td>12</td>\n",
       "      <td>6.95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>13085.0</td>\n",
       "      <td>489434</td>\n",
       "      <td>2009-12-01 07:45:00</td>\n",
       "      <td>79323P</td>\n",
       "      <td>PINK CHERRY LIGHTS</td>\n",
       "      <td>12</td>\n",
       "      <td>6.75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>13085.0</td>\n",
       "      <td>489434</td>\n",
       "      <td>2009-12-01 07:45:00</td>\n",
       "      <td>79323W</td>\n",
       "      <td>WHITE CHERRY LIGHTS</td>\n",
       "      <td>12</td>\n",
       "      <td>6.75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>13085.0</td>\n",
       "      <td>489434</td>\n",
       "      <td>2009-12-01 07:45:00</td>\n",
       "      <td>22041</td>\n",
       "      <td>RECORD FRAME 7\" SINGLE SIZE</td>\n",
       "      <td>48</td>\n",
       "      <td>2.10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>13085.0</td>\n",
       "      <td>489434</td>\n",
       "      <td>2009-12-01 07:45:00</td>\n",
       "      <td>21232</td>\n",
       "      <td>STRAWBERRY CERAMIC TRINKET BOX</td>\n",
       "      <td>24</td>\n",
       "      <td>1.25</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   customer_id invoice        invoice_date stock_code  \\\n",
       "0      13085.0  489434 2009-12-01 07:45:00      85048   \n",
       "1      13085.0  489434 2009-12-01 07:45:00     79323P   \n",
       "2      13085.0  489434 2009-12-01 07:45:00     79323W   \n",
       "3      13085.0  489434 2009-12-01 07:45:00      22041   \n",
       "4      13085.0  489434 2009-12-01 07:45:00      21232   \n",
       "\n",
       "                           description  quantity  price  \n",
       "0  15CM CHRISTMAS GLASS BALL 20 LIGHTS        12   6.95  \n",
       "1                   PINK CHERRY LIGHTS        12   6.75  \n",
       "2                  WHITE CHERRY LIGHTS        12   6.75  \n",
       "3         RECORD FRAME 7\" SINGLE SIZE         48   2.10  \n",
       "4       STRAWBERRY CERAMIC TRINKET BOX        24   1.25  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# load data\n",
    "\n",
    "df = pd.read_csv(\"retail.csv\", parse_dates=[\"invoice_date\"])\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create and entity set\n",
    "\n",
    "es = ft.EntitySet(id=\"data\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\woodwork\\type_sys\\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n",
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\woodwork\\type_sys\\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n"
     ]
    }
   ],
   "source": [
    "# Add the data to the entity\n",
    "\n",
    "es = es.add_dataframe(\n",
    "    dataframe=df,              # the dataframe with the data\n",
    "    dataframe_name=\"data\",     # unique name to associate with this dataframe\n",
    "    index=\"rows\",              # column name to index the items\n",
    "    make_index=True,           # if true, create a new column with unique values\n",
    "    time_index=\"invoice_date\", # column containing time data\n",
    "    logical_types={\n",
    "        \"customer_id\": Categorical, # the id is numerical, but should be handled as categorical\n",
    "        \"invoice\": Categorical,\n",
    "        \"description\": NaturalLanguage, # we need to set this variable as text for ft to work\n",
    "    },\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Entityset: data\n",
       "  DataFrames:\n",
       "    data [Rows: 741301, Columns: 8]\n",
       "    invoices [Rows: 40505, Columns: 3]\n",
       "  Relationships:\n",
       "    data.invoice -> invoices.invoice"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create a new dataframe with invoices\n",
    "# indicating its relationship to the main data\n",
    "\n",
    "es.normalize_dataframe(\n",
    "    base_dataframe_name=\"data\",     # Datarame name from which to split.\n",
    "    new_dataframe_name=\"invoices\",  # Name of the new dataframe.\n",
    "    index=\"invoice\",                # relationship will be created across this column.\n",
    "    copy_columns=[\"customer_id\"],   # columns to remove from base_dataframe and move to new dataframe.\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the date related features we want to extract\n",
    "\n",
    "text_primitives = [\n",
    "    \"num_words\", \"num_characters\", \"MeanCharactersPerWord\" , \"PunctuationCount\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<Feature: customer_id>,\n",
       " <Feature: invoice>,\n",
       " <Feature: stock_code>,\n",
       " <Feature: quantity>,\n",
       " <Feature: price>,\n",
       " <Feature: MEAN_CHARACTERS_PER_WORD(description)>,\n",
       " <Feature: NUM_CHARACTERS(description)>,\n",
       " <Feature: NUM_WORDS(description)>,\n",
       " <Feature: PUNCTUATION_COUNT(description)>]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create datetime features\n",
    "\n",
    "feature_matrix, feature_defs = ft.dfs(\n",
    "    entityset=es,                       # the entity set\n",
    "    target_dataframe_name=\"data\",       # the dataframe for wich to create the feature\n",
    "    agg_primitives=[],                  # we need an empty list to avoid returning the defo parameters\n",
    "    trans_primitives=text_primitives,   # the date features to extract\n",
    "    ignore_dataframes=[\"invoices\"],\n",
    ")\n",
    "\n",
    "# display name of created features\n",
    "feature_defs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>NUM_CHARACTERS(description)</th>\n",
       "      <th>NUM_WORDS(description)</th>\n",
       "      <th>PUNCTUATION_COUNT(description)</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>rows</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>35</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>18</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>20</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>28</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>30</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      NUM_CHARACTERS(description)  NUM_WORDS(description)  \\\n",
       "rows                                                        \n",
       "0                              35                       6   \n",
       "1                              18                       3   \n",
       "2                              20                       3   \n",
       "3                              28                       5   \n",
       "4                              30                       4   \n",
       "\n",
       "      PUNCTUATION_COUNT(description)  \n",
       "rows                                  \n",
       "0                                  0  \n",
       "1                                  0  \n",
       "2                                  0  \n",
       "3                                  1  \n",
       "4                                  0  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# dataframe with the new features\n",
    "text_f = [\n",
    "    \"NUM_CHARACTERS(description)\",\n",
    "    \"NUM_WORDS(description)\",\n",
    "    \"PUNCTUATION_COUNT(description)\",\n",
    "]\n",
    "\n",
    "feature_matrix[text_f].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# blog:\n",
    "# https://innovation.alteryx.com/natural-language-processing-featuretools/\n",
    "\n",
    "from nlp_primitives import (\n",
    "    DiversityScore,\n",
    "    MeanCharactersPerSentence,\n",
    "    PartOfSpeechCount,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_primitives = [\n",
    "    DiversityScore,\n",
    "#     MeanCharactersPerSentence,\n",
    "#     PartOfSpeechCount,\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<Feature: customer_id>,\n",
       " <Feature: invoice>,\n",
       " <Feature: stock_code>,\n",
       " <Feature: quantity>,\n",
       " <Feature: price>,\n",
       " <Feature: DIVERSITY_SCORE(description)>]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create datetime features\n",
    "\n",
    "feature_matrix, feature_defs = ft.dfs(\n",
    "    entityset=es,                       # the entity set\n",
    "    target_dataframe_name=\"data\",       # the dataframe for wich to create the feature\n",
    "    agg_primitives=[],                  # we need an empty list to avoid returning the defo parameters\n",
    "    trans_primitives=text_primitives,   # the date features to extract\n",
    "    ignore_dataframes=[\"invoices\"],\n",
    ")\n",
    "\n",
    "# display name of created features\n",
    "feature_defs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>customer_id</th>\n",
       "      <th>invoice</th>\n",
       "      <th>stock_code</th>\n",
       "      <th>quantity</th>\n",
       "      <th>price</th>\n",
       "      <th>DIVERSITY_SCORE(description)</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>rows</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>13085.0</td>\n",
       "      <td>489434</td>\n",
       "      <td>85048</td>\n",
       "      <td>12</td>\n",
       "      <td>6.95</td>\n",
       "      <td>0.833333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>13085.0</td>\n",
       "      <td>489434</td>\n",
       "      <td>79323P</td>\n",
       "      <td>12</td>\n",
       "      <td>6.75</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>13085.0</td>\n",
       "      <td>489434</td>\n",
       "      <td>79323W</td>\n",
       "      <td>12</td>\n",
       "      <td>6.75</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>13085.0</td>\n",
       "      <td>489434</td>\n",
       "      <td>22041</td>\n",
       "      <td>48</td>\n",
       "      <td>2.10</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>13085.0</td>\n",
       "      <td>489434</td>\n",
       "      <td>21232</td>\n",
       "      <td>24</td>\n",
       "      <td>1.25</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     customer_id invoice stock_code  quantity  price  \\\n",
       "rows                                                   \n",
       "0        13085.0  489434      85048        12   6.95   \n",
       "1        13085.0  489434     79323P        12   6.75   \n",
       "2        13085.0  489434     79323W        12   6.75   \n",
       "3        13085.0  489434      22041        48   2.10   \n",
       "4        13085.0  489434      21232        24   1.25   \n",
       "\n",
       "      DIVERSITY_SCORE(description)  \n",
       "rows                                \n",
       "0                         0.833333  \n",
       "1                         1.000000  \n",
       "2                         1.000000  \n",
       "3                         1.000000  \n",
       "4                         1.000000  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_matrix.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "fsml",
   "language": "python",
   "name": "fsml"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "165px"
   },
   "toc_section_display": "block",
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
